import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
import warnings
warnings.filterwarnings('ignore')
The data recorded by the restaurant has the following columns: total_bill: total bill in dollars, including taxes; tip: tip given to the waiter in dollars; sex: gender of the person paying the bill; smoker: whether the person smoked or not; day: day of the week; time: lunch or dinner; size: number of people at the table. Based on this data, our task is to find the factors affecting waiter tips and train a machine learning model to predict the tip a waiter receives.
# Load the tips dataset from a CSV file in the working directory.
df = pd.read_csv('tips.csv')
# Preview the first five rows.
df.head()
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
# Dimensions of the frame as a (rows, columns) tuple.
df.shape
(244, 7)
# Print column names, non-null counts, dtypes and memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 244 entries, 0 to 243 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 total_bill 244 non-null float64 1 tip 244 non-null float64 2 sex 244 non-null object 3 smoker 244 non-null object 4 day 244 non-null object 5 time 244 non-null object 6 size 244 non-null int64 dtypes: float64(2), int64(1), object(4) memory usage: 13.5+ KB
# Descriptive statistics of the numeric columns, transposed so that each
# column becomes a row.
df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| total_bill | 244.0 | 19.785943 | 8.902412 | 3.07 | 13.3475 | 17.795 | 24.1275 | 50.81 |
| tip | 244.0 | 2.998279 | 1.383638 | 1.00 | 2.0000 | 2.900 | 3.5625 | 10.00 |
| size | 244.0 | 2.569672 | 0.951100 | 1.00 | 2.0000 | 2.000 | 3.0000 | 6.00 |
# Total number of cells in the frame (rows * columns).
df.size
1708
# Column labels of the frame.
df.columns
Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')
# Element-wise boolean mask that is True wherever a value is missing.
df.isnull()
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| 0 | False | False | False | False | False | False | False |
| 1 | False | False | False | False | False | False | False |
| 2 | False | False | False | False | False | False | False |
| 3 | False | False | False | False | False | False | False |
| 4 | False | False | False | False | False | False | False |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 239 | False | False | False | False | False | False | False |
| 240 | False | False | False | False | False | False | False |
| 241 | False | False | False | False | False | False | False |
| 242 | False | False | False | False | False | False | False |
| 243 | False | False | False | False | False | False | False |
244 rows × 7 columns
# Number of missing values in each column.
df.isnull().sum()
total_bill 0 tip 0 sex 0 smoker 0 day 0 time 0 size 0 dtype: int64
# Distribution of the two continuous columns, one panel per column.
# plt.figure is used instead of plt.subplots because the latter also creates
# a default full-figure Axes that the panels added below would overlap.
plt.figure(figsize=(15, 8))
for i, col in enumerate(['total_bill', 'tip']):
    plt.subplot(2, 3, i + 1)
    # histplot(kde=True) replaces the deprecated sb.distplot; stat='density'
    # keeps the histogram on the density scale that distplot used.
    sb.histplot(df[col], kde=True, stat='density')
plt.tight_layout()
plt.show()
import seaborn as sns
# Pairwise relationships between the numeric columns.
# A numeric column passed as `hue` is removed from the grid variables, so
# hue='tip' would hide the target itself. Colouring by a categorical column
# keeps total_bill, tip and size all on the grid.
sns.pairplot(data=df, hue='sex')
<seaborn.axisgrid.PairGrid at 0x2972dc881d0>
# Matrix of pairwise scatter plots for the numeric columns of df.
pd.plotting.scatter_matrix(df)
array([[<Axes: xlabel='total_bill', ylabel='total_bill'>,
<Axes: xlabel='tip', ylabel='total_bill'>,
<Axes: xlabel='size', ylabel='total_bill'>],
[<Axes: xlabel='total_bill', ylabel='tip'>,
<Axes: xlabel='tip', ylabel='tip'>,
<Axes: xlabel='size', ylabel='tip'>],
[<Axes: xlabel='total_bill', ylabel='size'>,
<Axes: xlabel='tip', ylabel='size'>,
<Axes: xlabel='size', ylabel='size'>]], dtype=object)
import plotly.express as px
import plotly.graph_objects as go
# One bill-vs-tip scatter plot per grouping column: marker size follows the
# party size, colour follows the grouping column, and an OLS trendline is
# requested for each plot.
for colour_by in ("day", "sex", "time"):
    figure = px.scatter(
        data_frame=df,
        x="total_bill",
        y="tip",
        size="size",
        color=colour_by,
        trendline="ols",
    )
    figure.show()
# Donut charts (hole=0.5) of the tip values, with sectors named by each
# categorical column in turn.
for sector_column in ('day', 'sex', 'smoker', 'time'):
    figure = px.pie(df, values='tip', names=sector_column, hole=0.5)
    figure.show()
# Keep only the numeric columns of df; the non-numeric (object) columns are
# left out of the result.
numeric_df = df.select_dtypes(include=['number'])
numeric_df
| total_bill | tip | size | |
|---|---|---|---|
| 0 | 16.99 | 1.01 | 2 |
| 1 | 10.34 | 1.66 | 3 |
| 2 | 21.01 | 3.50 | 3 |
| 3 | 23.68 | 3.31 | 2 |
| 4 | 24.59 | 3.61 | 4 |
| ... | ... | ... | ... |
| 239 | 29.03 | 5.92 | 3 |
| 240 | 27.18 | 2.00 | 2 |
| 241 | 22.67 | 2.00 | 2 |
| 242 | 17.82 | 1.75 | 2 |
| 243 | 18.78 | 3.00 | 2 |
244 rows × 3 columns
# Pairwise correlation matrix of the numeric columns
# (DataFrame.corr defaults to the Pearson coefficient).
correlation_matrix = numeric_df.corr()
correlation_matrix
| total_bill | tip | size | |
|---|---|---|---|
| total_bill | 1.000000 | 0.675734 | 0.598315 |
| tip | 0.675734 | 1.000000 | 0.489299 |
| size | 0.598315 | 0.489299 | 1.000000 |
# Plot heatmap
import seaborn as sns
import matplotlib.pyplot as plt
# Annotated heatmap of the numeric correlation matrix on a 10x8 inch figure.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='YlGnBu', ax=heatmap_ax)
plt.show()
# Box plots of the two continuous columns, one panel per column, to make
# outliers visible.
# plt.figure is used instead of plt.subplots because the latter also creates
# a default full-figure Axes that the panels added below would overlap.
plt.figure(figsize=(15, 8))
for i, col in enumerate(['total_bill', 'tip']):
    plt.subplot(2, 3, i + 1)
    sb.boxplot(df[col])
plt.tight_layout()
plt.show()
# Compare the full frame's shape with the shape left after keeping only rows
# with total_bill < 45 and tip < 7. The filter is only inspected here; df
# itself is not modified.
df.shape, df[(df['total_bill']<45) & (df['tip']<7)].shape
((244, 7), (238, 7))
import seaborn as sb
import matplotlib.pyplot as plt
# Frequency of each value in the columns from 'sex' through 'size'
# (label-based slicing with .loc includes both endpoints).
feat = df.loc[:, 'sex':'size'].columns
# plt.figure is used instead of plt.subplots because the latter also creates
# a default full-figure Axes that the panels added below would overlap.
plt.figure(figsize=(15, 8))
for i, col in enumerate(feat):
    plt.subplot(2, 3, i + 1)
    sb.countplot(data=df, x=col)
plt.tight_layout()
plt.show()
# Scatter plot of tip (y axis) against total bill (x axis).
plt.scatter(df['total_bill'], df['tip'])
plt.title('Total Bill v/s Total Tip')
plt.xlabel('Total Bill')
plt.ylabel('Total Tip')
plt.show()
# Coerce 'size' to a numeric dtype; with errors='coerce', values that cannot
# be parsed become NaN instead of raising.
df['size'] = pd.to_numeric(df['size'], errors='coerce')
df['size']
0 2
1 3
2 3
3 2
4 4
..
239 3
240 2
241 2
242 2
243 2
Name: size, Length: 244, dtype: int64
# Drop rows whose 'size' is missing; df is rebound to the filtered frame.
df = df.dropna(subset=['size'])
df
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 |
| 240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 |
| 241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 |
| 242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 |
| 243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
244 rows × 7 columns
# Replace each categorical label with an integer code. Series.map turns any
# label that is absent from its mapping into NaN.
category_codes = {
    "sex": {"Female": 0, "Male": 1},
    "smoker": {"No": 0, "Yes": 1},
    "day": {"Thur": 0, "Fri": 1, "Sat": 2, "Sun": 3},
    "time": {"Lunch": 0, "Dinner": 1},
}
for column_name, codes in category_codes.items():
    df[column_name] = df[column_name].map(codes)
df.head()
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | 0 | 0 | 3 | 1 | 2 |
| 1 | 10.34 | 1.66 | 1 | 0 | 3 | 1 | 3 |
| 2 | 21.01 | 3.50 | 1 | 0 | 3 | 1 | 3 |
| 3 | 23.68 | 3.31 | 1 | 0 | 3 | 1 | 2 |
| 4 | 24.59 | 3.61 | 0 | 0 | 3 | 1 | 4 |
# Pairwise correlations across all columns of df.
df.corr()
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| total_bill | 1.000000 | 0.675734 | 0.144877 | 0.085721 | 0.173693 | 0.183118 | 0.598315 |
| tip | 0.675734 | 1.000000 | 0.088862 | 0.005929 | 0.135499 | 0.121629 | 0.489299 |
| sex | 0.144877 | 0.088862 | 1.000000 | 0.002816 | 0.230791 | 0.205231 | 0.086195 |
| smoker | 0.085721 | 0.005929 | 0.002816 | 1.000000 | -0.032653 | 0.054921 | -0.133178 |
| day | 0.173693 | 0.135499 | 0.230791 | -0.032653 | 1.000000 | 0.874366 | 0.165350 |
| time | 0.183118 | 0.121629 | 0.205231 | 0.054921 | 0.874366 | 1.000000 | 0.103411 |
| size | 0.598315 | 0.489299 | 0.086195 | -0.133178 | 0.165350 | 0.103411 | 1.000000 |
# Highlight strongly correlated column pairs. The absolute value is taken so
# that strong negative correlations (below -0.7) are flagged as well as
# strong positive ones.
plt.figure(figsize=(7, 7))
sb.heatmap(df.corr().abs() > 0.7, annot=True, cmap='YlGnBu')
plt.show()
# Separate the predictors from the 'tip' target, then hold out 20% of the
# rows for validation using a fixed seed.
target = df['tip']
features = df.drop(columns='tip')
X_train, X_val, Y_train, Y_val = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=22,
)
X_train.shape, X_val.shape
((195, 6), (49, 6))
# Display the training features.
X_train
| total_bill | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|
| 40 | 16.04 | 1 | 0 | 2 | 1 | 3 |
| 176 | 17.89 | 1 | 1 | 3 | 1 | 2 |
| 18 | 16.97 | 0 | 0 | 3 | 1 | 3 |
| 229 | 22.12 | 0 | 1 | 2 | 1 | 2 |
| 50 | 12.54 | 1 | 0 | 3 | 1 | 2 |
| ... | ... | ... | ... | ... | ... | ... |
| 100 | 11.35 | 0 | 1 | 1 | 1 | 2 |
| 192 | 28.44 | 1 | 1 | 0 | 0 | 2 |
| 44 | 30.40 | 1 | 0 | 3 | 1 | 4 |
| 132 | 11.17 | 0 | 0 | 0 | 0 | 2 |
| 117 | 10.65 | 0 | 0 | 0 | 0 | 2 |
195 rows × 6 columns
# Display the validation features.
X_val
| total_bill | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|
| 200 | 18.71 | 1 | 1 | 0 | 0 | 3 |
| 112 | 38.07 | 1 | 0 | 3 | 1 | 3 |
| 128 | 11.38 | 0 | 0 | 0 | 0 | 2 |
| 179 | 34.63 | 1 | 1 | 3 | 1 | 2 |
| 49 | 18.04 | 1 | 0 | 3 | 1 | 2 |
| 15 | 21.58 | 1 | 0 | 3 | 1 | 2 |
| 184 | 40.55 | 1 | 1 | 3 | 1 | 2 |
| 213 | 13.27 | 0 | 1 | 2 | 1 | 2 |
| 32 | 15.06 | 0 | 0 | 2 | 1 | 2 |
| 55 | 19.49 | 1 | 0 | 3 | 1 | 2 |
| 36 | 16.31 | 1 | 0 | 2 | 1 | 3 |
| 174 | 16.82 | 1 | 1 | 3 | 1 | 2 |
| 171 | 15.81 | 1 | 1 | 2 | 1 | 2 |
| 207 | 38.73 | 1 | 1 | 2 | 1 | 4 |
| 28 | 21.70 | 1 | 0 | 2 | 1 | 2 |
| 154 | 19.77 | 1 | 0 | 3 | 1 | 4 |
| 85 | 34.83 | 0 | 0 | 0 | 0 | 4 |
| 94 | 22.75 | 0 | 0 | 1 | 1 | 2 |
| 12 | 15.42 | 1 | 0 | 3 | 1 | 2 |
| 190 | 15.69 | 1 | 1 | 3 | 1 | 2 |
| 183 | 23.17 | 1 | 1 | 3 | 1 | 4 |
| 26 | 13.37 | 1 | 0 | 2 | 1 | 2 |
| 41 | 17.46 | 1 | 0 | 3 | 1 | 2 |
| 142 | 41.19 | 1 | 0 | 0 | 0 | 5 |
| 46 | 22.23 | 1 | 0 | 3 | 1 | 2 |
| 78 | 22.76 | 1 | 0 | 0 | 0 | 2 |
| 71 | 17.07 | 0 | 0 | 2 | 1 | 3 |
| 56 | 38.01 | 1 | 1 | 2 | 1 | 4 |
| 140 | 17.47 | 0 | 0 | 0 | 0 | 2 |
| 224 | 13.42 | 1 | 1 | 1 | 0 | 2 |
| 2 | 21.01 | 1 | 0 | 3 | 1 | 3 |
| 20 | 17.92 | 1 | 0 | 2 | 1 | 2 |
| 75 | 10.51 | 1 | 0 | 2 | 1 | 2 |
| 165 | 24.52 | 1 | 0 | 3 | 1 | 3 |
| 87 | 18.28 | 1 | 0 | 0 | 0 | 2 |
| 148 | 9.78 | 1 | 0 | 0 | 0 | 2 |
| 30 | 9.55 | 1 | 0 | 2 | 1 | 2 |
| 110 | 14.00 | 1 | 0 | 2 | 1 | 2 |
| 218 | 7.74 | 1 | 1 | 2 | 1 | 2 |
| 238 | 35.83 | 0 | 0 | 2 | 1 | 3 |
| 186 | 20.90 | 0 | 1 | 3 | 1 | 3 |
| 90 | 28.97 | 1 | 1 | 1 | 1 | 2 |
| 61 | 13.81 | 1 | 1 | 2 | 1 | 2 |
| 114 | 25.71 | 0 | 0 | 3 | 1 | 3 |
| 73 | 25.28 | 0 | 1 | 2 | 1 | 2 |
| 153 | 24.55 | 1 | 0 | 3 | 1 | 4 |
| 178 | 9.60 | 0 | 1 | 3 | 1 | 2 |
| 95 | 40.17 | 1 | 1 | 1 | 1 | 4 |
| 189 | 23.10 | 1 | 1 | 3 | 1 | 3 |
# Display the training targets (tip values).
Y_train
40 2.24
176 2.00
18 3.50
229 2.88
50 2.50
...
100 2.50
192 2.56
44 5.60
132 1.50
117 1.50
Name: tip, Length: 195, dtype: float64
# Display the validation targets (tip values).
Y_val
200 4.00 112 4.00 128 2.00 179 3.55 49 3.00 15 3.92 184 3.00 213 2.50 32 3.00 55 3.51 36 2.00 174 4.00 171 3.16 207 3.00 28 4.30 154 2.00 85 5.17 94 3.25 12 1.57 190 1.50 183 6.50 26 2.00 41 2.54 142 5.00 46 5.00 78 3.00 71 3.00 56 3.00 140 3.50 224 1.58 2 3.50 20 4.08 75 1.25 165 3.48 87 4.00 148 1.73 30 1.45 110 3.00 218 1.44 238 4.67 186 3.50 90 3.00 61 2.00 114 4.00 73 5.00 153 2.00 178 4.00 95 4.73 189 4.00 Name: tip, dtype: float64
# Standardize the features. The scaler is fitted on the training split only
# and then applied unchanged to the validation split, so validation
# statistics never influence the scaling.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_train
array([[-0.37642936, 0.79056942, -0.78202957, 0.29530609, 0.65865281,
0.43387166],
[-0.16555807, 0.79056942, 1.27872403, 1.15477905, 0.65865281,
-0.58546537],
[-0.2704238 , -1.26491106, -0.78202957, 1.15477905, 0.65865281,
0.43387166],
...,
[ 1.26038778, 0.79056942, -0.78202957, 1.15477905, 0.65865281,
1.45320869],
[-0.93153378, -1.26491106, -0.78202957, -1.42363982, -1.51825055,
-0.58546537],
[-0.99080571, -1.26491106, -0.78202957, -1.42363982, -1.51825055,
-0.58546537]])
# Display the validation features after scaling.
X_val
array([[-0.0720908 , 0.79056942, 1.27872403, -1.42363982, -1.51825055,
0.43387166],
[ 2.13464874, 0.79056942, -0.78202957, 1.15477905, 0.65865281,
0.43387166],
[-0.90759704, -1.26491106, -0.78202957, -1.42363982, -1.51825055,
-0.58546537],
[ 1.74254213, 0.79056942, 1.27872403, 1.15477905, 0.65865281,
-0.58546537],
[-0.1484604 , 0.79056942, -0.78202957, 1.15477905, 0.65865281,
-0.58546537],
[ 0.25504466, 0.79056942, -0.78202957, 1.15477905, 0.65865281,
-0.58546537],
[ 2.41733025, 0.79056942, 1.27872403, 1.15477905, 0.65865281,
-0.58546537],
[-0.69216637, -1.26491106, 1.27872403, 0.29530609, 0.65865281,
-0.58546537],
[-0.48813415, -1.26491106, -0.78202957, 0.29530609, 0.65865281,
-0.58546537],
[ 0.01681709, 0.79056942, -0.78202957, 1.15477905, 0.65865281,
-0.58546537],
[-0.34565355, 0.79056942, -0.78202957, 0.29530609, 0.65865281,
0.43387166],
[-0.28752147, 0.79056942, 1.27872403, 1.15477905, 0.65865281,
-0.58546537],
[-0.40264579, 0.79056942, 1.27872403, 0.29530609, 0.65865281,
-0.58546537],
[ 2.2098785 , 0.79056942, 1.27872403, 0.29530609, 0.65865281,
1.45320869],
[ 0.2687228 , 0.79056942, -0.78202957, 0.29530609, 0.65865281,
-0.58546537],
[ 0.04873275, 0.79056942, -0.78202957, 1.15477905, 0.65865281,
1.45320869],
[ 1.76533902, -1.26491106, -0.78202957, -1.42363982, -1.51825055,
1.45320869],
[ 0.3884065 , -1.26491106, -0.78202957, -0.56416686, 0.65865281,
-0.58546537],
[-0.44709974, 0.79056942, -0.78202957, 1.15477905, 0.65865281,
-0.58546537],
[-0.41632393, 0.79056942, 1.27872403, 1.15477905, 0.65865281,
-0.58546537],
[ 0.43627998, 0.79056942, 1.27872403, 1.15477905, 0.65865281,
1.45320869],
[-0.68076792, 0.79056942, -0.78202957, 0.29530609, 0.65865281,
-0.58546537],
[-0.2145714 , 0.79056942, -0.78202957, 1.15477905, 0.65865281,
-0.58546537],
[ 2.49028032, 0.79056942, -0.78202957, -1.42363982, -1.51825055,
2.47254572],
[ 0.32913457, 0.79056942, -0.78202957, 1.15477905, 0.65865281,
-0.58546537],
[ 0.38954635, 0.79056942, -0.78202957, -1.42363982, -1.51825055,
-0.58546537],
[-0.25902535, -1.26491106, -0.78202957, 0.29530609, 0.65865281,
0.43387166],
[ 2.12780967, 0.79056942, 1.27872403, 0.29530609, 0.65865281,
1.45320869],
[-0.21343156, -1.26491106, -0.78202957, -1.42363982, -1.51825055,
-0.58546537],
[-0.6750687 , 0.79056942, 1.27872403, -0.56416686, -1.51825055,
-0.58546537],
[ 0.1900735 , 0.79056942, -0.78202957, 1.15477905, 0.65865281,
0.43387166],
[-0.16213854, 0.79056942, -0.78202957, 0.29530609, 0.65865281,
-0.58546537],
[-1.00676354, 0.79056942, -0.78202957, 0.29530609, 0.65865281,
-0.58546537],
[ 0.59015903, 0.79056942, -0.78202957, 1.15477905, 0.65865281,
0.43387166],
[-0.12110413, 0.79056942, -0.78202957, -1.42363982, -1.51825055,
-0.58546537],
[-1.08997221, 0.79056942, -0.78202957, -1.42363982, -1.51825055,
-0.58546537],
[-1.11618864, 0.79056942, -0.78202957, 0.29530609, 0.65865281,
-0.58546537],
[-0.6089577 , 0.79056942, -0.78202957, 0.29530609, 0.65865281,
-0.58546537],
[-1.32250055, 0.79056942, 1.27872403, 0.29530609, 0.65865281,
-0.58546537],
[ 1.8793235 , -1.26491106, -0.78202957, 0.29530609, 0.65865281,
0.43387166],
[ 0.17753521, -1.26491106, 1.27872403, 1.15477905, 0.65865281,
0.43387166],
[ 1.09738997, 0.79056942, 1.27872403, -0.56416686, 0.65865281,
-0.58546537],
[-0.63061475, 0.79056942, 1.27872403, 0.29530609, 0.65865281,
-0.58546537],
[ 0.72580056, -1.26491106, -0.78202957, 1.15477905, 0.65865281,
0.43387166],
[ 0.67678724, -1.26491106, 1.27872403, 0.29530609, 0.65865281,
-0.58546537],
[ 0.59357857, 0.79056942, -0.78202957, 1.15477905, 0.65865281,
1.45320869],
[-1.11048942, -1.26491106, 1.27872403, 1.15477905, 0.65865281,
-0.58546537],
[ 2.37401615, 0.79056942, 1.27872403, -0.56416686, 0.65865281,
1.45320869],
[ 0.42830107, 0.79056942, 1.27872403, 1.15477905, 0.65865281,
0.43387166]])
# Fit each candidate regressor and report its mean absolute error (MAE) on
# the training and validation splits. MAE is an error measure, so lower is
# better; it is labelled as MAE rather than "accuracy" for that reason.
# The randomized models get a fixed seed so that reruns give the same scores.
models = [
    LinearRegression(),
    XGBRegressor(random_state=22),
    RandomForestRegressor(random_state=22),
    AdaBoostRegressor(random_state=22),
]
# Iterate over the models directly so the loop stays correct if the list
# is lengthened or shortened.
for model in models:
    model.fit(X_train, Y_train)
    print(f'{model} : ')
    pred_train = model.predict(X_train)
    print('Training MAE : ', mae(Y_train, pred_train))
    pred_val = model.predict(X_val)
    print('Validation MAE : ', mae(Y_val, pred_val))
    print()
LinearRegression() :
Training Accuracy : 0.7119950102059002
Validation Accuracy : 0.8394837715187264
XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, random_state=None, ...) :
Training Accuracy : 0.02740513884715545
Validation Accuracy : 0.8656760615718607
RandomForestRegressor() :
Training Accuracy : 0.2995215384615385
Validation Accuracy : 0.7950938775510201
AdaBoostRegressor() :
Training Accuracy : 0.6440256340358487
Validation Accuracy : 0.8072559727229153